This report presents a comprehensive analysis of the Steam games dataset. The analysis includes data description, preprocessing, descriptive statistics, distribution fitting, hypothesis testing, ANOVA, and regression analysis. The goal is to gain insights into the factors that influence game success on the Steam platform.
# Install required packages if not already installed
required_packages <- c("tidyverse", "ggplot2", "dplyr", "tidyr", "lubridate",
"stringr", "moments", "car", "fitdistrplus", "corrplot",
"knitr", "kableExtra", "gridExtra", "scales", "RColorBrewer",
"plotly", "viridis", "htmlwidgets")
# Function to check and install missing packages
# Attach the package named by the string `pkg`. When require() reports that it
# cannot be attached, install it (with dependencies) and attach it afterwards.
check_and_install <- function(pkg) {
  is_available <- require(pkg, character.only = TRUE)
  if (is_available) {
    # Nothing to install; mirror the invisible NULL of a skipped `if`.
    return(invisible(NULL))
  }
  install.packages(pkg, dependencies = TRUE)
  library(pkg, character.only = TRUE)
}
# Check and install all required packages
# Attach every required package (installing any that are missing). The return
# values carry no information, so they are discarded.
invisible(lapply(required_packages, check_and_install))

# Load the dataset
# The CSV location can be overridden with the STEAM_CSV_PATH environment
# variable; the original local path is kept as the default.
file_path <- Sys.getenv(
  "STEAM_CSV_PATH",
  unset = "C:/Users/Surya/Desktop/New folder/steam.csv"
)
# Fail early with the offending path instead of a generic connection error.
if (!file.exists(file_path)) {
  stop("Steam dataset not found at: ", file_path, call. = FALSE)
}
steam_data <- read.csv(file_path, stringsAsFactors = FALSE)
# Dataset dimensions
cat(sprintf("Dataset dimensions: %d rows and %d columns\n",
nrow(steam_data), ncol(steam_data)))## Dataset dimensions: 27075 rows and 18 columns
# Display the first few rows
kable(head(steam_data, 5)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
font_size = 11) %>%
scroll_box(width = "100%")| appid | name | release_date | english | developer | publisher | platforms | required_age | categories | genres | steamspy_tags | achievements | positive_ratings | negative_ratings | average_playtime | median_playtime | owners | price |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10 | Counter-Strike | 2000-11-01 | 1 | Valve | Valve | windows;mac;linux | 0 | Multi-player;Online Multi-Player;Local Multi-Player;Valve Anti-Cheat enabled | Action | Action;FPS;Multiplayer | 0 | 124534 | 3339 | 17612 | 317 | 10000000-20000000 | 7.19 |
| 20 | Team Fortress Classic | 1999-04-01 | 1 | Valve | Valve | windows;mac;linux | 0 | Multi-player;Online Multi-Player;Local Multi-Player;Valve Anti-Cheat enabled | Action | Action;FPS;Multiplayer | 0 | 3318 | 633 | 277 | 62 | 5000000-10000000 | 3.99 |
| 30 | Day of Defeat | 2003-05-01 | 1 | Valve | Valve | windows;mac;linux | 0 | Multi-player;Valve Anti-Cheat enabled | Action | FPS;World War II;Multiplayer | 0 | 3416 | 398 | 187 | 34 | 5000000-10000000 | 3.99 |
| 40 | Deathmatch Classic | 2001-06-01 | 1 | Valve | Valve | windows;mac;linux | 0 | Multi-player;Online Multi-Player;Local Multi-Player;Valve Anti-Cheat enabled | Action | Action;FPS;Multiplayer | 0 | 1273 | 267 | 258 | 184 | 5000000-10000000 | 3.99 |
| 50 | Half-Life: Opposing Force | 1999-11-01 | 1 | Gearbox Software | Valve | windows;mac;linux | 0 | Single-player;Multi-player;Valve Anti-Cheat enabled | Action | FPS;Action;Sci-fi | 0 | 5250 | 288 | 624 | 415 | 5000000-10000000 | 3.99 |
## 'data.frame': 27075 obs. of 18 variables:
## $ appid : int 10 20 30 40 50 60 70 80 130 220 ...
## $ name : chr "Counter-Strike" "Team Fortress Classic" "Day of Defeat" "Deathmatch Classic" ...
## $ release_date : chr "2000-11-01" "1999-04-01" "2003-05-01" "2001-06-01" ...
## $ english : int 1 1 1 1 1 1 1 1 1 1 ...
## $ developer : chr "Valve" "Valve" "Valve" "Valve" ...
## $ publisher : chr "Valve" "Valve" "Valve" "Valve" ...
## $ platforms : chr "windows;mac;linux" "windows;mac;linux" "windows;mac;linux" "windows;mac;linux" ...
## $ required_age : int 0 0 0 0 0 0 0 0 0 0 ...
## $ categories : chr "Multi-player;Online Multi-Player;Local Multi-Player;Valve Anti-Cheat enabled" "Multi-player;Online Multi-Player;Local Multi-Player;Valve Anti-Cheat enabled" "Multi-player;Valve Anti-Cheat enabled" "Multi-player;Online Multi-Player;Local Multi-Player;Valve Anti-Cheat enabled" ...
## $ genres : chr "Action" "Action" "Action" "Action" ...
## $ steamspy_tags : chr "Action;FPS;Multiplayer" "Action;FPS;Multiplayer" "FPS;World War II;Multiplayer" "Action;FPS;Multiplayer" ...
## $ achievements : int 0 0 0 0 0 0 0 0 0 33 ...
## $ positive_ratings: int 124534 3318 3416 1273 5250 2758 27755 12120 3822 67902 ...
## $ negative_ratings: int 3339 633 398 267 288 684 1100 1439 420 2419 ...
## $ average_playtime: int 17612 277 187 258 624 175 1300 427 361 691 ...
## $ median_playtime : int 317 62 34 184 415 10 83 43 205 402 ...
## $ owners : chr "10000000-20000000" "5000000-10000000" "5000000-10000000" "5000000-10000000" ...
## $ price : num 7.19 3.99 3.99 3.99 3.99 3.99 7.19 7.19 3.99 7.19 ...
## appid name release_date english
## Min. : 10 Length:27075 Length:27075 Min. :0.0000
## 1st Qu.: 401230 Class :character Class :character 1st Qu.:1.0000
## Median : 599070 Mode :character Mode :character Median :1.0000
## Mean : 596204 Mean :0.9811
## 3rd Qu.: 798760 3rd Qu.:1.0000
## Max. :1069460 Max. :1.0000
## developer publisher platforms required_age
## Length:27075 Length:27075 Length:27075 Min. : 0.0000
## Class :character Class :character Class :character 1st Qu.: 0.0000
## Mode :character Mode :character Mode :character Median : 0.0000
## Mean : 0.3549
## 3rd Qu.: 0.0000
## Max. :18.0000
## categories genres steamspy_tags achievements
## Length:27075 Length:27075 Length:27075 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 7.00
## Mean : 45.25
## 3rd Qu.: 23.00
## Max. :9821.00
## positive_ratings negative_ratings average_playtime median_playtime
## Min. : 0 Min. : 0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 6 1st Qu.: 2 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 24 Median : 9 Median : 0.0 Median : 0.0
## Mean : 1001 Mean : 211 Mean : 149.8 Mean : 146.1
## 3rd Qu.: 126 3rd Qu.: 42 3rd Qu.: 0.0 3rd Qu.: 0.0
## Max. :2644404 Max. :487076 Max. :190625.0 Max. :190625.0
## owners price
## Length:27075 Min. : 0.000
## Class :character 1st Qu.: 1.690
## Mode :character Median : 3.990
## Mean : 6.078
## 3rd Qu.: 7.190
## Max. :421.990
# Check for missing values
missing_values <- colSums(is.na(steam_data))
missing_df <- data.frame(
Column = names(missing_values),
Missing_Values = missing_values
)
missing_df %>%
filter(Missing_Values > 0) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Column | Missing_Values |
|---|---|
_(No rows: the filter found no columns with missing values.)_
# Function to handle missing values
# Impute missing values column by column.
#
# Args:
#   df: A data frame.
#
# Returns:
#   `df` with NA entries in numeric columns replaced by that column's median
#   (computed with NAs removed) and NA entries in character columns replaced
#   by "Unknown". Columns of any other type are returned untouched.
handle_missing_values <- function(df) {
  # vapply() always yields a logical vector, even for a zero-column data
  # frame, where sapply() would return an empty list and break the indexing.
  numeric_cols <- names(df)[vapply(df, is.numeric, logical(1))]
  categorical_cols <- names(df)[vapply(df, is.character, logical(1))]

  # For numeric columns, replace NA with median
  for (col in numeric_cols) {
    gaps <- is.na(df[[col]])
    if (any(gaps)) {
      df[[col]][gaps] <- median(df[[col]], na.rm = TRUE)
    }
  }

  # For categorical columns, replace NA with "Unknown"
  for (col in categorical_cols) {
    gaps <- is.na(df[[col]])
    if (any(gaps)) {
      df[[col]][gaps] <- "Unknown"
    }
  }

  df
}
# Apply the function to handle missing values
steam_clean <- handle_missing_values(steam_data)
# Check missing values after handling
remaining_missing <- colSums(is.na(steam_clean))
cat("Remaining missing values after handling:", sum(remaining_missing))## Remaining missing values after handling: 0
# Process the release_date column to extract year
# First, convert to Date format, then extract year
steam_clean$release_date <- as.Date(steam_clean$release_date, format = "%Y-%m-%d", optional = TRUE)
steam_clean$release_year <- year(steam_clean$release_date)
# Handle missing years with median
median_year <- median(steam_clean$release_year, na.rm = TRUE)
steam_clean$release_year[is.na(steam_clean$release_year)] <- median_year
# Release year distribution
year_counts <- table(steam_clean$release_year)
year_df <- data.frame(
Year = as.numeric(names(year_counts)),
Count = as.numeric(year_counts)
)
# Create the interactive plot with plotly
year_plot <- year_df %>%
arrange(desc(Count)) %>%
head(10) %>%
plot_ly(
x = ~reorder(Year, -Count),
y = ~Count,
type = 'bar',
marker = list(
color = colorRampPalette(c("lightblue", "steelblue", "darkblue"))(10),
line = list(color = 'rgb(8,48,107)', width = 1.5)
),
hoverinfo = "text",
hovertext = ~paste("Year:", Year, "<br>Number of Games:", Count)
) %>%
layout(
title = "Top 10 Years by Game Count",
xaxis = list(title = "Release Year", tickangle = 45),
yaxis = list(title = "Number of Games"),
hoverlabel = list(bgcolor = "white")
)
# Display the plotly plot
year_plot# Process the owners column to extract numeric representation
# Convert owner ranges such as "10000-20000" to the midpoint of the range.
#
# Args:
#   owners_range: A single string or a character vector of "lower-upper"
#     ranges. Thousands separators (",") and surrounding whitespace in each
#     bound are ignored.
#
# Returns:
#   A numeric vector with one midpoint per input element. Elements that are
#   NA, or that lack either bound, yield NA.
extract_owners_mean <- function(owners_range) {
  # strsplit() needs character input; the coercion also lets a bare NA through.
  pieces <- strsplit(as.character(owners_range), "-", fixed = TRUE)

  # Pull the i-th bound of every range; indexing past the end gives NA.
  bound_at <- function(i) {
    raw <- vapply(pieces, function(p) p[i], character(1))
    as.numeric(gsub(",", "", trimws(raw)))
  }

  (bound_at(1) + bound_at(2)) / 2
}
steam_clean$owners_mean <- sapply(steam_clean$owners, extract_owners_mean)
# Ownership statistics
summary_owners <- summary(steam_clean$owners_mean)
summary_owners## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10000 10000 10000 134090 35000 150000000
# Create a price category variable
steam_clean$price_category <- cut(
steam_clean$price,
breaks = c(-0.01, 0.01, 10, 20, 30, 100, 1000),
labels = c("Free", "Budget", "Mid-range", "Premium", "AAA", "Collector")
)
# Price category distribution
price_cat_counts <- table(steam_clean$price_category)
price_cat_df <- data.frame(
Category = names(price_cat_counts),
Count = as.numeric(price_cat_counts)
)
# Create interactive plot for price categories
plot_ly(
price_cat_df,
x = ~reorder(Category, -Count),
y = ~Count,
type = "bar",
marker = list(
color = plasma(6, alpha = 0.8),
line = list(color = "rgb(58,54,107)", width = 1.5)
),
hoverinfo = "text",
hovertext = ~paste("Category:", Category, "<br>Number of Games:", Count,
"<br>Percentage:", round(Count/sum(Count)*100, 1), "%")
) %>%
layout(
title = list(text = "Distribution of Price Categories", font = list(size = 16)),
xaxis = list(title = "Price Category", tickangle = 45),
yaxis = list(title = "Number of Games"),
hoverlabel = list(bgcolor = "white")
)# INSIGHTS:
# The price distribution visualization reveals that budget-priced games (under $10) dominate the Steam
# marketplace, followed by free games. Premium-priced and AAA-priced games are much less common, indicating
# that most developers target the lower price tiers. The hover information shows both counts and percentages.# Calculate the ratio of positive to total ratings
# Total number of ratings per game, then the share of them that is positive.
steam_clean$total_ratings <- with(steam_clean, positive_ratings + negative_ratings)
steam_clean$positive_ratio <- with(steam_clean, positive_ratings / total_ratings)
# Games without any ratings give 0 / 0 = NaN; those entries are set to 0.
# NOTE(review): this scores unrated games the same as all-negative ones.
steam_clean$positive_ratio <- replace(
  steam_clean$positive_ratio,
  is.na(steam_clean$positive_ratio),
  0
)
# Positive ratings ratio statistics
summary(steam_clean$positive_ratio)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.5833 0.7603 0.7145 0.8939 1.0000
# Create an interactive histogram for positive ratings ratio
plot_ly(
steam_clean,
x = ~positive_ratio,
type = "histogram",
nbinsx = 50,
marker = list(
color = "rgba(70, 130, 180, 0.7)",
line = list(color = "rgba(8, 48, 107, 1)", width = 1)
),
hoverinfo = "y"
) %>%
layout(
title = list(text = "Distribution of Positive Ratings Ratio", font = list(size = 16)),
xaxis = list(title = "Positive Ratings / Total Ratings", range = c(0, 1)),
yaxis = list(title = "Number of Games"),
bargap = 0.1
)# INSIGHTS:
# The ratings distribution shows a distinct left (negative) skew with most games having a positive ratings ratio
# above 0.7, suggesting that Steam users generally rate games positively or that lower-rated games
# may be removed from the platform. There's a noticeable peak near 1.0, indicating many games with
# very high approval ratings.# Process platforms into binary columns
steam_clean$has_windows <- as.integer(grepl("windows", tolower(steam_clean$platforms)))
steam_clean$has_mac <- as.integer(grepl("mac", tolower(steam_clean$platforms)))
steam_clean$has_linux <- as.integer(grepl("linux", tolower(steam_clean$platforms)))
# Platform distribution
platform_counts <- c(
Windows = sum(steam_clean$has_windows),
Mac = sum(steam_clean$has_mac),
Linux = sum(steam_clean$has_linux)
)
platform_df <- data.frame(
Platform = names(platform_counts),
Count = platform_counts
)
# Create interactive plot for platform distribution
# The hover text reports each platform's share of ALL games in the dataset,
# so the denominator is the number of rows in steam_clean. Dividing by
# max(Count) would instead express every platform relative to the most
# supported one, which does not match the "Percentage of Total" label.
total_games <- nrow(steam_clean)
plot_ly(
  platform_df,
  x = ~reorder(Platform, -Count),
  y = ~Count,
  type = "bar",
  marker = list(
    color = c("#2C7BB6", "#D7191C", "#FDAE61"),
    line = list(color = "rgb(8,48,107)", width = 1.5)
  ),
  hoverinfo = "text",
  hovertext = ~paste("Platform:", Platform, "<br>Number of Games:", Count,
                     "<br>Percentage of Total:", round(Count / total_games * 100, 1), "%")
) %>%
  layout(
    title = list(text = "Games Available by Platform", font = list(size = 16)),
    xaxis = list(title = "Platform"),
    yaxis = list(title = "Number of Games"),
    hoverlabel = list(bgcolor = "white")
  )
# INSIGHTS:
# Windows dominates the platform availability with nearly all Steam games supporting it.
# Mac support is available for roughly half as many games, while Linux has the least support.
# This confirms Windows' continued dominance in PC gaming. The interactive chart shows the
# percentage of all games in the dataset that support each platform.
# Create a platform count variable
steam_clean$platform_count <- steam_clean$has_windows + steam_clean$has_mac + steam_clean$has_linux
# Platform count distribution
platform_count_table <- table(steam_clean$platform_count)
platform_count_df <- data.frame(
Number_of_Platforms = as.numeric(names(platform_count_table)),
Count = as.numeric(platform_count_table)
)
# Create interactive plot for platform count
plot_ly(
platform_count_df,
x = ~as.factor(Number_of_Platforms),
y = ~Count,
type = "bar",
marker = list(
color = viridis(3, alpha = 0.8),
line = list(color = "rgb(8,48,107)", width = 1.5)
),
hoverinfo = "text",
hovertext = ~paste("Platforms Supported:", Number_of_Platforms,
"<br>Number of Games:", Count,
"<br>Percentage:", round(Count/sum(Count)*100, 1), "%")
) %>%
layout(
title = list(text = "Number of Supported Platforms per Game", font = list(size = 16)),
xaxis = list(title = "Number of Platforms"),
yaxis = list(title = "Number of Games"),
hoverlabel = list(bgcolor = "white")
)# INSIGHTS:
# This visualization shows that while many games are Windows-only (supporting just one platform),
# a significant number of games support all three platforms (Windows, Mac, and Linux). Very few games
# support exactly two platforms, suggesting developers either focus solely on Windows or aim for
# full cross-platform compatibility.# Function to get top categories/genres
# Count how often each ";"-separated label occurs in a column and return the
# most frequent ones.
#
# Args:
#   column: Name (string) of a character column holding ";"-separated labels.
#   n: Maximum number of labels to return (default 10).
#   data: Data frame to read `column` from. Defaults to the global
#     `steam_clean`, so existing two-argument calls behave as before.
#
# Returns:
#   A table of label counts sorted in decreasing order, truncated to at most
#   `n` entries. NA entries of the column are ignored and labels are
#   whitespace-trimmed before counting.
get_top_categories <- function(column, n = 10, data = steam_clean) {
  all_categories <- trimws(unlist(strsplit(na.omit(data[[column]]), ";")))
  ranked <- sort(table(all_categories), decreasing = TRUE)
  # `[1:n]` would pad the result with NA entries when fewer than `n` distinct
  # labels exist; cap the index at the number of labels actually present.
  ranked[seq_len(min(n, length(ranked)))]
}
# Top 10 Categories
top_categories <- get_top_categories("categories", 10)
cat_df <- data.frame(
Category = names(top_categories),
Count = as.numeric(top_categories)
)
# Create interactive plot for categories
cat_plot <- plot_ly(
cat_df,
x = ~reorder(Category, -Count),
y = ~Count,
type = 'bar',
marker = list(
color = viridis(10, alpha = 0.8),
line = list(color = 'rgb(8,48,107)', width = 1.5)
),
hoverinfo = "text",
hovertext = ~paste("Category:", Category, "<br>Count:", Count)
) %>%
layout(
title = list(text = "Top 10 Game Categories", font = list(size = 16)),
xaxis = list(title = "Category", tickangle = 45),
yaxis = list(title = "Count"),
hoverlabel = list(bgcolor = "white")
)
cat_plot# Top 10 Genres
top_genres <- get_top_categories("genres", 10)
genre_df <- data.frame(
Genre = names(top_genres),
Count = as.numeric(top_genres)
)
# Create interactive plot for genres
genre_plot <- plot_ly(
genre_df,
x = ~reorder(Genre, -Count),
y = ~Count,
type = 'bar',
marker = list(
color = plasma(10, alpha = 0.8),
line = list(color = 'rgb(58,54,107)', width = 1.5)
),
hoverinfo = "text",
hovertext = ~paste("Genre:", Genre, "<br>Count:", Count)
) %>%
layout(
title = list(text = "Top 10 Game Genres", font = list(size = 16)),
xaxis = list(title = "Genre", tickangle = 45),
yaxis = list(title = "Count"),
hoverlabel = list(bgcolor = "white")
)
genre_plot# Create binary columns for top genres
for (genre in names(top_genres)[1:5]) {
genre_col_name <- paste0("genre_", genre)
steam_clean[[genre_col_name]] <- as.integer(grepl(genre, steam_clean$genres, fixed = TRUE))
}
# INSIGHTS:
# The interactive bar charts reveal that "Single-player" is by far the most common game category,
# followed by "Steam Achievements" and "Steam Trading Cards". For genres, "Indie" dominates the
# Steam platform, followed by "Action" and "Casual". This shows the significant presence of
# independent developers on Steam and the popularity of action-oriented gameplay.# Distribution of games by release year
year_data <- data.frame(table(steam_clean$release_year))
names(year_data) <- c("Year", "Count")
year_data$Year <- as.numeric(as.character(year_data$Year))
year_data <- year_data[year_data$Year >= 1990 & year_data$Year <= 2024, ]
# Create interactive plot for release year trends
year_data_filtered <- year_data
color_scale <- colorRampPalette(c("lightblue", "steelblue", "darkblue"))(length(year_data_filtered$Year))
colors_by_year <- color_scale[rank(year_data_filtered$Count)]
plot_ly(
year_data_filtered,
x = ~Year,
y = ~Count,
type = "scatter",
mode = "lines+markers",
line = list(color = 'steelblue', width = 2),
marker = list(color = colors_by_year, size = 8),
hoverinfo = "text",
hovertext = ~paste("Year:", Year, "<br>Number of Games:", Count)
) %>%
layout(
title = list(text = "Number of Games Released by Year", font = list(size = 16)),
xaxis = list(title = "Release Year", tickmode = "array", tickvals = seq(1990, 2024, by = 5)),
yaxis = list(title = "Number of Games"),
hoverlabel = list(bgcolor = "white")
)# INSIGHTS:
# The timeline reveals a dramatic acceleration in game releases starting around 2012, with an exponential
# increase through 2018. This corresponds to Steam's growth and the rise of indie game development.
# The visualization shows how the platform evolved from having few releases in the early years to
# becoming a major publishing platform by the mid-2010s.# Create interactive histogram for price distribution
plot_ly(
steam_clean,
x = ~price,
type = "histogram",
nbinsx = 50,
marker = list(
color = "rgba(70, 130, 180, 0.7)",
line = list(color = "rgba(8, 48, 107, 1)", width = 1)
),
hoverinfo = "y"
) %>%
layout(
title = list(text = "Distribution of Game Prices", font = list(size = 16)),
xaxis = list(title = "Price (USD)", range = c(0, 100)),
yaxis = list(title = "Number of Games"),
bargap = 0.1
)# Price distribution with log scale (interactive)
plot_ly(
steam_clean,
x = ~price,
type = "histogram",
nbinsx = 50,
marker = list(
color = "rgba(70, 130, 180, 0.7)",
line = list(color = "rgba(8, 48, 107, 1)", width = 1)
),
hoverinfo = "y"
) %>%
layout(
title = list(text = "Distribution of Game Prices (Log Scale)", font = list(size = 16)),
xaxis = list(title = "Price (USD)", range = c(0, 100)),
yaxis = list(title = "Number of Games", type = "log"),
bargap = 0.1
)# INSIGHTS:
# The price distribution is heavily skewed, with most games priced below $20. There's a significant
# concentration at certain price points (e.g., $0, $9.99, $19.99), suggesting strategic pricing by developers
# at these psychological price points. Very few games are priced above $60, which aligns with traditional
# AAA game pricing limits.# Create a subset with fewer points for better visualization
set.seed(123)
price_rating_sample <- steam_clean[sample(nrow(steam_clean), min(5000, nrow(steam_clean))), ]
# Create interactive scatterplot for price vs ratings
plot_ly(
price_rating_sample,
x = ~price,
y = ~positive_ratio,
type = "scatter",
mode = "markers",
marker = list(
color = ~price,
colorscale = "Viridis",
size = 7,
opacity = 0.7,
showscale = TRUE,
colorbar = list(title = "Price ($)")
),
hoverinfo = "text",
hovertext = ~paste("Price: $", price, "<br>Positive Ratio:", round(positive_ratio, 2),
"<br>Name:", name)
) %>%
layout(
title = list(text = "Relationship between Price and Positive Ratings Ratio", font = list(size = 16)),
xaxis = list(title = "Price (USD)", range = c(0, 100)),
yaxis = list(title = "Positive Ratings Ratio"),
hoverlabel = list(bgcolor = "white")
)# Create interactive boxplot for ratings by price category
# Median positive ratio of each price category, looked up per row for the
# hover text. (Comparing price_category with itself is TRUE for every row,
# which would select the whole column and report the overall median.)
category_medians <- tapply(
  steam_clean$positive_ratio,
  steam_clean$price_category,
  median,
  na.rm = TRUE
)
plot_ly(
  steam_clean,
  y = ~positive_ratio,
  color = ~price_category,
  type = "box",
  colors = viridis(6),
  hoverinfo = "text",
  hovertext = ~paste("Category:", price_category,
                     "<br>Median Positive Ratio:",
                     round(category_medians[as.character(price_category)], 2))
) %>%
  layout(
    title = list(text = "Positive Ratings Ratio by Price Category", font = list(size = 16)),
    xaxis = list(title = "Price Category"),
    yaxis = list(title = "Positive Ratings Ratio"),
    hoverlabel = list(bgcolor = "white"),
    showlegend = FALSE
  )
# INSIGHTS:
# The scatterplot reveals a slight positive correlation between price and ratings, with higher-priced
# games generally receiving better ratings. However, there's considerable variability, especially
# in the $10-30 range. The boxplot further confirms that premium and AAA-priced games tend to have
# higher median positive ratings than free or budget titles, suggesting higher-priced games generally
# deliver better quality experiences.# Average playtime by genre
# Get the top 5 genre names
top_5_genres <- names(top_genres)[1:5]
# Mean average_playtime over the games flagged in each genre_<name> indicator
# column. The table is built in one pass rather than by growing a data frame
# with rbind() inside a loop.
genre_time <- data.frame(
  Genre = top_5_genres,
  AveragePlaytime = vapply(
    top_5_genres,
    function(genre) {
      in_genre <- steam_clean[[paste0("genre_", genre)]] == 1
      mean(steam_clean$average_playtime[in_genre], na.rm = TRUE)
    },
    numeric(1),
    # Unnamed result keeps the default 1..n row names of the data frame.
    USE.NAMES = FALSE
  )
)
# Longest average playtime first.
genre_time <- genre_time[order(-genre_time$AveragePlaytime), ]
# Create interactive horizontal bar chart for genre playtime
plot_ly(
genre_time,
y = ~reorder(Genre, AveragePlaytime),
x = ~AveragePlaytime,
type = "bar",
orientation = "h",
marker = list(
color = viridis(nrow(genre_time), alpha = 0.8),
line = list(color = "rgb(8,48,107)", width = 1.5)
),
hoverinfo = "text",
hovertext = ~paste("Genre:", Genre, "<br>Average Playtime:", round(AveragePlaytime, 1), "minutes",
"<br>(", round(AveragePlaytime/60, 1), "hours)")
) %>%
layout(
title = list(text = "Average Playtime by Genre", font = list(size = 16)),
yaxis = list(title = ""),
xaxis = list(title = "Average Playtime (minutes)"),
hoverlabel = list(bgcolor = "white")
)# Average playtime by price category
price_time <- steam_clean %>%
group_by(price_category) %>%
summarize(AveragePlaytime = mean(average_playtime, na.rm = TRUE)) %>%
arrange(desc(AveragePlaytime))
# Create interactive horizontal bar chart for price category playtime
plot_ly(
price_time,
y = ~reorder(price_category, AveragePlaytime),
x = ~AveragePlaytime,
type = "bar",
orientation = "h",
marker = list(
color = plasma(nrow(price_time), alpha = 0.8),
line = list(color = "rgb(58,54,107)", width = 1.5)
),
hoverinfo = "text",
hovertext = ~paste("Price Category:", price_category,
"<br>Average Playtime:", round(AveragePlaytime, 1), "minutes",
"<br>(", round(AveragePlaytime/60, 1), "hours)")
) %>%
layout(
title = list(text = "Average Playtime by Price Category", font = list(size = 16)),
yaxis = list(title = ""),
xaxis = list(title = "Average Playtime (minutes)"),
hoverlabel = list(bgcolor = "white")
)# INSIGHTS:
# The genre analysis reveals which types of games tend to be most engaging for players, with certain genres
# showing significantly higher average playtimes. The price category visualization demonstrates that
# higher-priced games generally maintain player engagement for longer periods, likely due to more content
# or depth of gameplay.# Average playtime by platform count
platform_time <- steam_clean %>%
group_by(platform_count) %>%
summarize(
AveragePlaytime = mean(average_playtime, na.rm = TRUE),
Count = n()
)
# Create interactive bar chart for platform count vs playtime
plot_ly(
platform_time,
x = ~as.factor(platform_count),
y = ~AveragePlaytime,
type = "bar",
marker = list(
color = viridis(nrow(platform_time), alpha = 0.8),
line = list(color = "rgb(8,48,107)", width = 1.5)
),
hoverinfo = "text",
hovertext = ~paste("Platforms:", platform_count,
"<br>Average Playtime:", round(AveragePlaytime, 1), "minutes",
"<br>(", round(AveragePlaytime/60, 1), "hours)",
"<br>Number of Games:", Count)
) %>%
layout(
title = list(text = "Average Playtime by Platform Count", font = list(size = 16)),
xaxis = list(title = "Number of Platforms"),
yaxis = list(title = "Average Playtime (minutes)"),
hoverlabel = list(bgcolor = "white")
)# Average ratings by platform count
platform_ratings <- steam_clean %>%
group_by(platform_count) %>%
summarize(
AveragePositiveRatio = mean(positive_ratio, na.rm = TRUE),
Count = n()
)
# Create interactive bar chart for platform count vs ratings
plot_ly(
platform_ratings,
x = ~as.factor(platform_count),
y = ~AveragePositiveRatio,
type = "bar",
marker = list(
color = plasma(nrow(platform_ratings), alpha = 0.8),
line = list(color = "rgb(58,54,107)", width = 1.5)
),
hoverinfo = "text",
hovertext = ~paste("Platforms:", platform_count,
"<br>Average Positive Ratio:", round(AveragePositiveRatio, 3),
"<br>Number of Games:", Count)
) %>%
layout(
title = list(text = "Average Positive Ratings Ratio by Platform Count", font = list(size = 16)),
xaxis = list(title = "Number of Platforms"),
yaxis = list(title = "Average Positive Ratings Ratio"),
hoverlabel = list(bgcolor = "white")
)# INSIGHTS:
# Games that support multiple platforms tend to have both higher average playtimes and higher positive
# ratings ratios. This may indicate that developers who invest in multi-platform support also invest more
# in overall game quality and content depth, or that the broader accessibility leads to more diverse
# player feedback.# Prepare data for fitting (remove 0s and 1s for beta distribution)
fit_data <- steam_clean$positive_ratio
fit_data <- fit_data[fit_data > 0.01 & fit_data < 0.99]
# Fit distributions
fit_norm <- fitdist(fit_data, "norm")
fit_beta <- fitdist(fit_data, "beta")
fit_gamma <- fitdist(fit_data, "gamma")
# Compare models using AIC
aic_comparison <- data.frame(
Distribution = c("Normal", "Beta", "Gamma"),
AIC = c(fit_norm$aic, fit_beta$aic, fit_gamma$aic)
)
aic_comparison %>%
arrange(AIC) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Distribution | AIC |
|---|---|
| Beta | -15963.267 |
| Normal | -10130.781 |
| Gamma | -4588.201 |
# Determine best fit
# AIC of each candidate fit, named by distribution (lower is better).
aic_values <- c(
  Normal = fit_norm$aic,
  Beta = fit_beta$aic,
  Gamma = fit_gamma$aic
)
best_aic <- min(aic_values)
# which.min() returns exactly one index (the first on ties) and avoids
# comparing floating-point values with `==`.
best_dist <- names(aic_values)[which.min(aic_values)]
cat(sprintf("Best fitting distribution: %s\n", best_dist))
## Best fitting distribution: Beta
# Plot the distributions - adjust margins and layout
# Lay the four comparison plots out on a 2 x 2 grid. par() returns the
# previous settings, which are restored once the figure is complete.
old_par <- par(mfrow = c(2, 2), mar = c(4, 4, 2, 1), oma = c(0, 0, 2, 0))
plot.legend <- c("Normal", "Beta", "Gamma")
fitted_models <- list(fit_norm, fit_beta, fit_gamma)

# Draw one comparison panel with `plot_fun` (denscomp, qqcomp, cdfcomp or
# ppcomp). If the call fails, the panel is filled with a placeholder showing
# the actual error message, so the remaining panels are still drawn and the
# cause of the failure stays visible.
draw_fit_panel <- function(plot_fun, label) {
  invisible(tryCatch(
    plot_fun(fitted_models, legendtext = plot.legend),
    error = function(e) {
      plot(1, 1, type = "n", xlab = "", ylab = "",
           main = paste("Error in", label, "plot"))
      text(1, 1, conditionMessage(e))
    }
  ))
}

# Individual plots with separate error handling
draw_fit_panel(denscomp, "density")
draw_fit_panel(qqcomp, "QQ")
draw_fit_panel(cdfcomp, "CDF")
draw_fit_panel(ppcomp, "PP")

# Add overall title
title("Comparison of Distribution Fits", outer = TRUE)
par(old_par)
# INSIGHTS:
# The distribution fitting analysis reveals which statistical distribution best models the positive ratings
# ratio of Steam games. The AIC values indicate that the Beta distribution provides the best fit, which is
# expected for data bounded between 0 and 1. This suggests that ratings follow a predictable pattern that
# can be modeled for future game performance prediction.